library(MASS)
library(nnet)
library(Matching)
library(foreach)
library(doParallel)
library(CBPS)
library(boot)
library(ggplot2)
library(gbm)


# Calculates the F statisti of the model before and after weighting, both
# covariate-wise and for the whole model
getbal<-function(dat, covars, wts){
  out <- rep(0,1 + 2*ncol(covars))
  out[1:ncol(covars)] <- coef(summary(lm(treat ~ covars, data = dat, weights = wts)))[-1, "t value"]^2
  for (i in 1:ncol(covars)){
    out[ncol(covars)+i] <- summary(lm(treat ~ covars[,i], data = dat, weights = wts))$fstatistic[1]
  }
  out[2*ncol(covars) + 1] <- summary(lm(treat ~ covars, data = dat, weights = wts))$fstatistic[1]
  return(out)
}

# Edit paths below to point to the directory that contains this file
# Load replication data
load("FECForCEM.RData") 
# Load bootstraps
boot.data<-read.csv("UNboot.csv", header = FALSE)

# Find the Box-Cox transformation of the treatment
# that makes its distribution closest to normal
max.norm.cor<-cor(qqnorm(x$TotAds)$x,qqnorm(x$TotAds)$y)
best.bc<-x$TotAds
best.lambda<-1
for (lambda in seq(-2,2,0.01))
{
  if (lambda != 0)
  {
    bc.totads<-((x$TotAds+1)^lambda - 1)/lambda
  }
  else
  {
    bc.totads<-log(x$TotAds+1)
  }
  norm.cor<-cor(qqnorm(bc.totads)$x,qqnorm(bc.totads)$y)
  if (norm.cor > max.norm.cor)
  {
    max.norm.cor<-norm.cor
    best.bc<-bc.totads
    best.lambda<-lambda
  }
}

x$treat<-best.bc

# Define the treatment model and the covariates associated with it
nfe.pscore.form <- (treat ~ log(TotalPop) + PercentOver65 + log(Inc + 1) + PercentHispanic + PercentBlack + density + 
                      per_collegegrads + CanCommute + I(log(TotalPop)^2) + I(PercentOver65^2) + I(log(Inc + 1)^2) + 
                      I(PercentHispanic^2) + I(PercentBlack^2) + I(density^2) + I(per_collegegrads^2))
covars <- model.matrix(~ -1 + log(TotalPop) + PercentOver65 + log(Inc + 1) + PercentHispanic + PercentBlack + density + 
                         per_collegegrads + CanCommute + I(log(TotalPop)^2) + I(PercentOver65^2) + I(log(Inc + 1)^2) + 
                         I(PercentHispanic^2) + I(PercentBlack^2) + I(density^2) + I(per_collegegrads^2),
                       data = x)

F.aac.iter=function(i,ps.model,data,ps.num,rep,criterion) { 
  
  GBM.fitted=predict(ps.model,newdata=data,n.trees=floor(i),type="response") 
  ps.den=dnorm((data$T-GBM.fitted)/sd(data$T-GBM.fitted),0,1)
  wt=ps.num/ps.den
  aac_iter=rep(NA,rep)
  for (i in 1:rep){
    bo=sample(1:dim(data)[1],replace=TRUE,prob=wt)
    newsample=data[bo,]
    j.drop=match(c("T"),names(data))
    j.drop=j.drop[!is.na(j.drop)]
    x=newsample[,-j.drop]
    if(criterion=="spearman"|criterion=="kendall"){
      ac=apply(x, MARGIN=2, FUN=cor, y=newsample$T,method=criterion)
    } else if (criterion=="distance"){
      ac=apply(x, MARGIN=2, FUN=dcor, y=newsample$T) 
    } else if (criterion=="pearson"){
      ac=matrix(NA,dim(x)[2],1)
      for (j in 1:dim(x)[2]){
        ac[j]=ifelse (!is.factor(x[,j]), cor(newsample$T, x[,j], 
                                             method=criterion),polyserial(newsample$T, x[,j]))
      }
    } else print("The criterion is not correctly specified")
    aac_iter[i]=mean(abs(1/2*log((1+ac)/(1-ac))),na.rm=TRUE)
  }
  aac=mean(aac_iter)
  return(aac)
}

# Started at 9:24 AM
mydata=data.frame(T=x$treat,X=covars)
model.num=lm(T~1,data=mydata)
ps.num=dnorm((mydata$T-model.num$fitted)/(summary(model.num))$sigma,0,1)
model.den=gbm(T~.,data=mydata, shrinkage = 0.0005, interaction.depth=4, distribution="gaussian",n.trees=20000)
opt=optimize(F.aac.iter,interval=c(1,20000),ps.model=model.den,data=mydata,ps.num=ps.num,rep=50,criterion="pearson")
best.aac.iter=opt$minimum
best.aac=opt$objective
model.den$fitted=predict(model.den,newdata=mydata,n.trees=floor(best.aac.iter),type="response")   
ps.den=dnorm((mydata$T-model.den$fitted)/sd(mydata$T-model.den$fitted),0,1)
weight.gbm=ps.num/ps.den
# Done 9:33 AM

# Fit CBPS and npCBPS
pscorefit.nfe.exact<-CBPS(nfe.pscore.form, data = x, twostep = TRUE, method = "exact")
pscorefit.nfe.np<-npCBPS(nfe.pscore.form, data = x, corprior=0.1/nrow(x))

# Fit MLE
pscorefit.nfe.mle<-lm(nfe.pscore.form, data = x, x = TRUE)
mle.nfe.sigmasq<-mean((x$treat - fitted(pscorefit.nfe.mle))^2)
mle.nfe.stabilizers<-sapply(x$treat, function(t) mean(dnorm(t, mean = fitted(pscorefit.nfe.mle), sd = sqrt(mle.nfe.sigmasq))))
pscorefit.mle.nfe.weights<-mle.nfe.stabilizers/dnorm(x$treat, mean = fitted(pscorefit.nfe.mle), sd = sqrt(mle.nfe.sigmasq))

mle.nfe.fit<-pscorefit.nfe.exact
mle.nfe.fit$weights<-pscorefit.mle.nfe.weights

gbm.nfe.fit<-pscorefit.nfe.exact
gbm.nfe.fit$weights <- weight.gbm

# Replicate dichotomized matching
matchout<-Match(Y=(x$Cont*1000 + 1), Tr=x$Treatment1, X=x$PropScore1, M=1)
getWeights<-function(match.out, n)
{
  weights<-rep(0,n)
  weights<-weights+sapply(1:n, function(x) sum(match.out$weights*(match.out$index.treated == x)))
  weights<-weights+sapply(1:n, function(x) sum(match.out$weights*(match.out$index.control == x)))
  weights
}
matchfit <- pscorefit.nfe.exact
matchfit$weights <- getWeights(matchout, length(pscorefit.nfe.exact$weights))

# Compare balance of various weighting/matching schemes
unwt.balstats <- getbal(x, covars, rep(1, length(pscorefit.mle.nfe.weights)))
match.balstats <- getbal(x, covars, matchfit$weights)
mle.balstats <- getbal(x, covars, pscorefit.mle.nfe.weights)
exact.balstats <- getbal(x, covars, pscorefit.nfe.exact$weights)
np.balstats <- getbal(x, covars, pscorefit.nfe.np$weights)
gbm.balstats <- getbal(x, covars, weight.gbm)

#plot(y = c(abs(balance(pscorefit.nfe.np)$balanced), 
#           abs(balance(pscorefit.nfe.exact)$balanced),
#           abs(balance(mle.nfe.fit)$balanced),
#           abs(balance(matchfit)$balanced),
#           abs(balance(mle.nfe.fit)$unweighted)),
#     x = c(rep(7.5, ncol(covars)), rep(6, ncol(covars)), rep(4.5, ncol(covars)), 
#           rep(3, ncol(covars)), rep(1.5, ncol(covars))),
#     xaxt='n', ylab="Absolute Pearson Correlations", main="Absolute Pearson Correlations of Covariates Post-Weighting", xlab = "", 
#     xlim = c(1,8), pch = 16)
#axis(1, at = c(7.5, 6, 4.5, 3, 1.5), labels = c("npCBPS", "CBPS", "MLE", "Dichotomized", "Unweighted"))

# Figure 3
par(mfrow=c(1,2), cex = 1.3, mar=c(5.5,4.1,4.1,2.1))
boxplot(c(abs(balance(pscorefit.nfe.np)$unweighted)), c(abs(balance(matchfit)$balanced)), 
        c(abs(balance(mle.nfe.fit)$balanced)), c(abs(balance(gbm.nfe.fit)$balanced)), 
        c(abs(balance(pscorefit.nfe.exact)$balanced)), 
        c(abs(balance(pscorefit.nfe.np)$balanced)), 
        xaxt = "n",
        ylab="Absolute Pearson Correlations", main="Absolute Pearson Correlations of Covariates")
axis(1, at = 1:6, labels = c("Unweighted", "Matching", "MLE", "GBM", "CBGPS", "npCBGPS"), las = 2)

boxplot(unwt.balstats[16:30]^.25, match.balstats[16:30]^.25,
        mle.balstats[16:30]^.25, gbm.balstats[16:30]^.25, 
        exact.balstats[16:30]^.25, np.balstats[16:30]^.25, 
        xaxt = "n",
        yaxt = 'n',
        ylab = "F-statistic", main = "F-Statistic of Regressing Treatment on Each Covariate")
axis(1, at = 1:6, labels = c("Unweighted", "Matching", "MLE", "GBM", "CBGPS", "npCBGPS"), las = 2)
axis(2, at = c(0, 1, 10, 100, 1000, 10000)^.25, labels = c(0, 1, 10, 100, 1000, 10000))
par(mfrow = c(1,1), cex = 1, mar=c(5.1,4.1,4.1,2.1))

library(xtable)
xtable(data.frame(Unweighted = balance(gbm.nfe.fit)$unweighted, 
                  MLE = balance(mle.nfe.fit)$balanced, 
                  GBM = balance(gbm.nfe.fit)$balanced,
                  CBGPS = balance(pscorefit.nfe.exact)$balanced, 
                  npCBGPS = balance(pscorefit.nfe.np)$balanced), digits = 3)

# Get in-sample F-statistic
unwt.balstats[31]
match.balstats[31]
mle.balstats[31]
gbm.balstats[31]
exact.balstats[31]
np.balstats[31]

#plot(x = c(rep(1.5, ncol(covars)), rep(3, ncol(covars)), 
#           rep(4.5, ncol(covars)), rep(6, ncol(covars)),
#           rep(7.5, ncol(covars))), 
#     y = c(unwt.balstats[16:30]^.25, match.balstats[16:30]^.25,
#           mle.balstats[16:30]^.25, exact.balstats[16:30]^.25, 
#           np.balstats[16:30]^.25),
#     xaxt = 'n', ylab = "F-statistic", xlab = "", ylim = c(0, 10),
#     xlim = c(1,8), yaxt = 'n', main = "F-Statistic of Regressing the Treatment on Each Covariate Post-Weighting", 
#     pch = 16)
#axis(1, at = c(7.5,6,4.5,3,1.5), labels = c("npCBPS", "CBPS", "MLE", "Matching", "Unweighted"))
#axis(2, at = c(0, 1, 10, 100, 1000, 10000)^.25, labels = c(0, 1, 10, 100, 1000, 10000))

#boxplot(boot.data[,63]^.25, boot.data[,125]^.25, boot.data[,156]^.25,
#        names = c("MLE", "CBGPS", "npCBGPS"), ylab = "F-statistic",
#        pch = 16, yaxt = 'n', ylim = c(0,10))
#axis(2, at = c(0, 1, 10, 100, 1000, 10000)^.25, labels = c(0, 1, 10, 100, 1000, 10000))

summary(as.vector(as.matrix(boot.data[,63])))
summary(as.vector(as.matrix(boot.data[,94])))
summary(as.vector(as.matrix(boot.data[,125])))
summary(as.vector(as.matrix(boot.data[,156])))

#boxplot(as.vector(as.matrix(boot.data[,48:62]^.25)), 
#        as.vector(as.matrix(boot.data[,110:124]^.25)), as.vector(as.matrix(boot.data[,131:155]^.25)),
#        names = c("MLE", "CBGPS", "npCBGPS"), ylab = "F-statistic",
#        pch = 16, ylim = c(0, 10), yaxt = 'n')
#axis(2, at = c(0, 1, 10, 100, 1000, 10000)^.25, labels = c(0, 1, 10, 100, 1000, 10000))

# Get point estimates that account for uncertainty in the weights (first column of Table 1)
fit.outcome<-function(dat, wts, best.lambda){
  outfit<-lm(Cont ~ treat + I(treat^2) + factor(StFIPS), data = x, weights=wts)
  
  dose <- ifelse(best.lambda != 0, ((1000+1)^best.lambda-1)/best.lambda, log(1000+1))
  dose.eff <- coef(outfit)[2]*dose + coef(outfit)[3]*dose^2
  
  return(dose.eff)
}

eff.mle <- fit.outcome(x, pscorefit.mle.nfe.weights, best.lambda)
eff.np <- fit.outcome(x, pscorefit.nfe.np$weights, best.lambda)
eff.exact <- fit.outcome(x, pscorefit.nfe.exact$weights, best.lambda)
eff.gbm <- fit.outcome(x, weight.gbm, best.lambda)
eff.mle*1000 # MLE
eff.exact*1000 # CBPS
eff.np*1000 # npCBPS
eff.gbm*1000 # GBM

mean(boot.data[,157])*1000 # MLE
mean(boot.data[,158])*1000 # CBPS
mean(boot.data[,159])*1000 # npCBPS
mean(boot.data[,160])*1000 # GBM

# Get standard deviations that account for uncertainty in the weights (second column of Table 1)
sd(boot.data[,157])*1000 # MLE
sd(boot.data[,158])*1000 # CBPS
sd(boot.data[,159])*1000 # npCBPS
sd(boot.data[,160])*1000 # GBM

# Get confidence intervals that account for uncertainty in the weights (Third column of Table 1)
summary(matchout)
quantile(boot.data[,157], c(.025, .975))*1000 # MLE
quantile(boot.data[,158], c(.025, .975))*1000 # CBPS
quantile(boot.data[,159], c(.025, .975))*1000 # npCBPS
quantile(boot.data[,160], c(.025, .975))*1000 # GBM

outfit <- lm(Cont ~ treat + I(treat^2) + factor(StFIPS), data = x, weights=cbpsfit$weights,
             x = TRUE, y = TRUE)
outfit.vcov <- vcov_outcome.CBPSContinuous(pscorefit.nfe.exact, outfit$y, outfit$x, coef(outfit))

# Standard error for 1000 ad dose
dose<-ifelse(best.lambda != 0, ((1000+1)^best.lambda-1)/best.lambda, log(1000+1))

dose.eff <- (dose*coef(outfit)[2] + dose^2*coef(outfit)[3])*1000
dose.stderr <- sqrt(dose^2*outfit.vcov[2,2] + dose^4*outfit.vcov[3,3] + 2*dose^3*outfit.vcov[2,3])*1000
dose.stderr
c(dose.eff - 1.96*dose.stderr, dose.eff + 1.96*dose.stderr)

# Plot confidence intervals that account for uncertainty in the weights
#par(mfrow=c(1,1))
#plot(x = c(1.5, 3, 4.5), y = c(mean(boot.data[,226]), mean(boot.data[,426]), mean(boot.data[,526])), 
#     xlim = c(1,5), ylim = c(-5000,20000), xlab = "", ylab = "Extra Contributions for 1000 Ads", xaxt = "n")
#axis(1, at = c(1.5, 3, 4.5), labels = c("MLE", "CBPS", "npCBPS"))
#lines(x = c(1.5,1.5), y = quantile(boot.data[,226], c(.025, .975)))
#lines(x = c(3,3), y = quantile(boot.data[,426], c(.025, .975)))
#lines(x = c(4.5,4.5), y = quantile(boot.data[,526], c(.025, .975)))
#abline(0,0, lty="dashed")